{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "afdf7a60-bc81-42a1-b7e4-2bb857431695",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generating synthetic asset inventory...\n",
      "Generating synthetic break records...\n",
      "Synthetic data generation complete!\n",
      "Synthetic assets saved to: C:\\Users\\apoudel\\OneDrive - City of Sugar Land\\AI Hallucination Test\\synthetic_water_asset_data.csv\n",
      "Synthetic breaks saved to: C:\\Users\\apoudel\\OneDrive - City of Sugar Land\\AI Hallucination Test\\synthetic_break_records.csv\n",
      "Number of synthetic assets: 10000\n",
      "Number of synthetic breaks: 4983\n",
      " Synthetic break rate: 49.83%\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import roc_auc_score, f1_score, classification_report\n",
    "import xgboost as xg\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "AssetData = pd.read_csv('synthetic_water_asset_data.csv')\n",
    "BreakData = pd.read_csv('synthetic_break_records.csv')\n",
    "LLMResponses = pd.read_excel('LLM_Output_for_Mainbreak_Test.xlsx')\n",
    "\n",
    "BreakCounts = BreakData['AssetID'].value_counts().reset_index()\n",
    "BreakCounts.columns = ['AssetID', 'BreakCount']\n",
    "AssetData = pd.merge(AssetData, BreakCounts, on='AssetID', how='left')\n",
    "AssetData['BreakCount'] = AssetData['BreakCount'].fillna(0)\n",
    "AssetData['HasBroken'] = (AssetData['BreakCount'] > 0).astype(int)\n",
    "\n",
    "Features = AssetData[['Material', 'Diameter', 'Age', 'Depth', 'Length']]\n",
    "Target = AssetData['HasBroken']\n",
    "\n",
    "TrainFeatures, TestFeatures, TrainTarget, TestTarget = train_test_split(Features, Target, test_size=0.2, random_state=42, stratify=Target)\n",
    "\n",
    "LogisticModel = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)\n",
    "ForestModel = RandomForestClassifier(class_weight='balanced', random_state=42)\n",
    "XGBModel = xg.XGBClassifier(scale_pos_weight=(len(TrainTarget) - sum(TrainTarget)) / sum(TrainTarget), random_state=42)\n",
    "\n",
    "LogisticModel.fit(TrainFeatures, TrainTarget)\n",
    "ForestModel.fit(TrainFeatures, TrainTarget)\n",
    "XGBModel.fit(TrainFeatures, TrainTarget)\n",
    "\n",
    "LogisticPredictions = LogisticModel.predict_proba(TestFeatures)[:, 1]\n",
    "ForestPredictions = ForestModel.predict_proba(TestFeatures)[:, 1]\n",
    "XGBPredictions = XGBModel.predict_proba(TestFeatures)[:, 1]\n",
    "\n",
    "LogisticAUC = roc_auc_score(TestTarget, LogisticPredictions)\n",
    "ForestAUC = roc_auc_score(TestTarget, ForestPredictions)\n",
    "XGBAUC = roc_auc_score(TestTarget, XGBPredictions)\n",
    "\n",
    "LogisticPredictionsBinary = (LogisticPredictions > 0.5).astype(int)\n",
    "ForestPredictionsBinary = (ForestPredictions > 0.5).astype(int)\n",
    "XGBPredictionsBinary = (XGBPredictions > 0.5).astype(int)\n",
    "\n",
    "LogisticF1 = f1_score(TestTarget, LogisticPredictionsBinary)\n",
    "ForestF1 = f1_score(TestTarget, ForestPredictionsBinary)\n",
    "XGBF1 = f1_score(TestTarget, XGBPredictionsBinary)\n",
    "\n",
    "PerformanceData = {\n",
    "    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost'],\n",
    "    'ROC_AUC': [LogisticAUC, ForestAUC, XGBAUC],\n",
    "    'F1_Score': [LogisticF1, ForestF1, XGBF1]\n",
    "}\n",
    "PerformanceDataFrame = pd.DataFrame(PerformanceData)\n",
    "\n",
    "ForestImportance = ForestModel.feature_importances_\n",
    "XGBImportance = XGBModel.feature_importances_\n",
    "\n",
    "FeatureNames = ['Material', 'Diameter', 'Age', 'Depth', 'Length']\n",
    "\n",
    "NormalizedForestImportance = (ForestImportance - ForestImportance.min()) / (ForestImportance.max() - ForestImportance.min())\n",
    "NormalizedXGBImportance = (XGBImportance - XGBImportance.min()) / (XGBImportance.max() - XGBImportance.min())\n",
    "\n",
    "AverageImportance = (NormalizedForestImportance + NormalizedXGBImportance) / 2\n",
    "\n",
    "GroundTruthImportance = pd.DataFrame({\n",
    "    'Feature': FeatureNames,\n",
    "    'Random_Forest_Importance': NormalizedForestImportance,\n",
    "    'XGBoost_Importance': NormalizedXGBImportance,\n",
    "    'Average_Importance': AverageImportance\n",
    "})\n",
    "\n",
    "LLMResponses['Response_Text'] = LLMResponses['Response_Text'].astype(str)\n",
    "\n",
    "TermDictionary = {\n",
    "    'length': ['length', 'long', 'segment size', 'mile', 'footage', 'extent', 'distance'],\n",
    "    'age': ['age', 'old', 'year', 'install date', 'decade', 'lifespan', 'older', 'newer'],\n",
    "    'material': ['material', 'cast iron', 'ci', 'pvc', 'ac', 'asbestos cement', 'ductile iron', 'hdpe', 'polyethylene'],\n",
    "    'diameter': ['diameter', 'size', 'inch', 'dimension', 'bore'],\n",
    "    'pressure': ['pressure', 'psi', 'water pressure', 'operating pressure']\n",
    "}\n",
    "\n",
    "TermCounts = {}\n",
    "for Category, Terms in TermDictionary.items():\n",
    "    Pattern = '|'.join(Terms)\n",
    "    TermCounts[Category] = LLMResponses['Response_Text'].str.lower().str.count(Pattern).sum()\n",
    "\n",
    "TotalMentions = sum(TermCounts.values())\n",
    "TermShares = {Category: Count / TotalMentions for Category, Count in TermCounts.items()}\n",
    "\n",
    "TermCountsSeries = pd.Series(TermCounts)\n",
    "TermSharesSeries = pd.Series(TermShares)\n",
    "\n",
    "LLMAlignmentData = pd.DataFrame({\n",
    "    'Term': TermCountsSeries.index,\n",
    "    'Mention_Count': TermCountsSeries.values,\n",
    "    'Mention_Share': TermSharesSeries.values\n",
    "})\n",
    "\n",
    "MergedImportance = pd.merge(GroundTruthImportance, LLMAlignmentData, left_on='Feature', right_on='Term')\n",
    "MergedImportance['Alignment_Ratio'] = MergedImportance['Mention_Share'] / MergedImportance['Average_Importance']\n",
    "\n",
    "AssetData['ReplacementCost'] = AssetData['Length'] * 250\n",
    "AssetData['FailureProbability'] = XGBModel.predict_proba(Features)[:, 1]\n",
    "AssetData['ExpectedFailures'] = AssetData['FailureProbability']\n",
    "AssetData['CostEffectiveness'] = AssetData['ExpectedFailures'] / AssetData['ReplacementCost']\n",
    "\n",
    "AssetData = AssetData.sort_values('CostEffectiveness', ascending=False)\n",
    "AssetData['CumulativeCost'] = AssetData['ReplacementCost'].cumsum()\n",
    "AssetData['CumulativeExpectedFailures'] = AssetData['ExpectedFailures'].cumsum()\n",
    "\n",
    "Budget = 5000000\n",
    "CostEffectiveSelection = AssetData[AssetData['CumulativeCost'] <= Budget]\n",
    "CostEffectiveFailuresPrevented = CostEffectiveSelection['CumulativeExpectedFailures'].iloc[-1]\n",
    "\n",
    "NaiveSelection = AssetData.sort_values('FailureProbability', ascending=False)\n",
    "NaiveSelection['CumulativeCost'] = NaiveSelection['ReplacementCost'].cumsum()\n",
    "NaiveSelection = NaiveSelection[NaiveSelection['CumulativeCost'] <= Budget]\n",
    "NaiveFailuresPrevented = NaiveSelection['ExpectedFailures'].sum()\n",
    "\n",
    "RandomFailuresList = []\n",
    "for i in range(1000):\n",
    "    RandomSample = AssetData.sample(n=len(CostEffectiveSelection), random_state=i)\n",
    "    RandomFailuresList.append(RandomSample['ExpectedFailures'].sum())\n",
    "RandomFailuresPrevented = np.mean(RandomFailuresList)\n",
    "RandomFailuresLow = np.percentile(RandomFailuresList, 2.5)\n",
    "RandomFailuresHigh = np.percentile(RandomFailuresList, 97.5)\n",
    "\n",
    "CapitalPlanningSummary = {\n",
    "    'Strategy': ['Cost-Effectiveness', 'Naive Priority', 'Random Selection'],\n",
    "    'Expected_Failures_Prevented': [CostEffectiveFailuresPrevented, NaiveFailuresPrevented, RandomFailuresPrevented],\n",
    "    'Budget_Utilized': [Budget, Budget, Budget],\n",
    "    'Efficiency_Ratio': [CostEffectiveFailuresPrevented/Budget, NaiveFailuresPrevented/Budget, RandomFailuresPrevented/Budget]\n",
    "}\n",
    "CapitalPlanningDataFrame = pd.DataFrame(CapitalPlanningSummary)\n",
    "\n",
    "OutputDirectory = r\"C:\\Users\\apoudel\\OneDrive - City of Sugar Land\\AI Hallucination Test\\Analysis_Outputs\"\n",
    "\n",
    "PerformanceDataFrame.to_csv(OutputDirectory + r\"\\Model_Performance.csv\", index=False)\n",
    "GroundTruthImportance.to_csv(OutputDirectory + r\"\\Ground_Truth_Feature_Importance.csv\", index=False)\n",
    "LLMAlignmentData.to_csv(OutputDirectory + r\"\\LLM_Response_Analysis.csv\", index=False)\n",
    "MergedImportance.to_csv(OutputDirectory + r\"\\Ground_Truth_LLM_Alignment.csv\", index=False)\n",
    "CapitalPlanningDataFrame.to_csv(OutputDirectory + r\"\\Capital_Planning_Comparison.csv\", index=False)\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.bar(GroundTruthImportance['Feature'], GroundTruthImportance['Average_Importance'])\n",
    "plt.title('Ground Truth Feature Importance')\n",
    "plt.ylabel('Normalized Importance Score')\n",
    "plt.xticks(rotation=45)\n",
    "plt.tight_layout()\n",
    "plt.savefig(OutputDirectory + r\"\\Ground_Truth_Importance_Plot.png\")\n",
    "plt.close()\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.bar(LLMAlignmentData['Term'], LLMAlignmentData['Mention_Count'])\n",
    "plt.title('LLM Mention Frequency by Term')\n",
    "plt.ylabel('Total Mention Count')\n",
    "plt.xticks(rotation=45)\n",
    "plt.tight_layout()\n",
    "plt.savefig(OutputDirectory + r\"\\LLM_Mention_Frequency_Plot.png\")\n",
    "plt.close()\n",
    "\n",
    "print(\"Analysis complete. All outputs saved to:\", OutputDirectory)\n",
    "print(\"Model Performance:\")\n",
    "print(PerformanceDataFrame)\n",
    "print(\"\\nCapital Planning Summary:\")\n",
    "print(CapitalPlanningDataFrame)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "834ab4b9-b754-486d-bb46-360c00742d97",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
